The purpose of exploratory analysis is to perform initial investigations on the data so as to discover patterns, identify any anomalies, to test initial hypothesis and check assumptions made at the start of the project.
In this section of the project we will ask two types of questions:
Variation being the tendency for values of a variable to change from measurement to measurement, and covariance is the tendency for two or more variables to vary together in a related way.
pacman::p_load(
tidyverse,
here,
RColorBrewer,
lubridate,
scales,
GGally,
stats,
corrplot,
mice,
VIM
)
flights_dt <- read_csv(here("clean_data/flights_clean.csv"))
Rows: 327346 Columns: 38
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): origin, origin_name, dest, dest_name, carrier, carrier_name, tailnum, manufacturer, model, engine, type
dbl (22): dep_delay, arr_delay, air_time, distance, flight, engines, seats, aircraft_age, lat, lon, alt, wind_di...
dttm (5): dep_time, sched_dep_time, arr_time, sched_arr_time, time_hour
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
flights_dt %>%
group_by(origin) %>%
ggplot() +
aes(x = origin, fill = origin) +
geom_bar() +
scale_fill_manual(values = cbPalette) +
labs(
x = "airport",
y = "departure numbers",
title = "Departure numbers by airport"
) +
guides(fill = "none") +
theme_bw()
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = month, y = mean_delay, colour = origin) +
geom_line(alpha = 0.6) +
geom_point(alpha = 0.8) +
labs(
x = "month",
y = "mean delay (minutes)",
title = " Mean delay by month",
colour = "airport"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
summarise(no_of_delays = n(), .groups = "drop") %>%
ggplot() +
aes(x = month, y = no_of_delays, colour = origin) +
geom_line(alpha = 0.6) +
geom_point(alpha = 0.8) +
labs(
x = "month",
y = "number of delays",
title = " Monthly trend of departure delays",
colour = "airport"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()
flights_dt %>%
mutate(month = month(sched_dep_time, label = TRUE)) %>%
group_by(month, origin) %>%
summarise(no_of_flights = n(), .groups = "drop") %>%
ggplot() +
aes(x = month, y = no_of_flights, fill = origin) +
geom_col(position = "dodge", alpha = 0.8) +
labs(
x = "month",
y = "number of departures",
title = "Departure numbers by month",
fill = "airport"
) +
scale_fill_manual(values = cbPalette) +
theme_bw()
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(carrier_name, origin) %>%
summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
geom_col(alpha = 0.8) +
facet_wrap(~ origin) +
labs(
x = "carrier",
y = "mean delay (minutes)",
title = "Mean departure delay by carrier",
fill = "airport"
) +
theme_bw() +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
coord_flip()
flights_dt %>%
filter(arr_delay > 0) %>%
group_by(carrier_name, origin) %>%
summarise(mean_delay = mean(arr_delay), .groups = "drop") %>%
ggplot() +
aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
geom_col(alpha = 0.8) +
facet_wrap(~ origin) +
labs(
x = "carrier",
y = "mean delay (minutes)",
title = "Mean arrival delay by carrier",
fill = "airport"
) +
theme_bw() +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
coord_flip()
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
ggplot() +
aes(x = day, y = mean_delay, colour = origin) +
geom_point(alpha = 0.8) +
labs(
x = "date",
y = "mean delay (minutes)",
title = "Mean departure delay by month"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
select(dep_delay, hour) %>%
group_by(hour) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
ggplot() +
aes(x = hour, y = mean_delay) +
geom_point(alpha = 0.8, colour = "#56B4E9") +
geom_smooth(se = FALSE, colour = "#E69F00") +
labs(
x = "hour",
y = "mean delay (minutes)",
title = "Mean departure delay by departure time"
) +
theme_bw() +
scale_x_continuous(limits = c(5, 23), breaks = seq(5, 23, by = 1))
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
mutate(weekday = wday(sched_dep_time, label = TRUE)) %>%
group_by(weekday) %>%
summarise(mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = weekday, y = mean_delay) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "weekday",
y = "mean departure delay (minutes)",
title = "Departure delay by weekday"
) +
theme_bw()
flights_dt %>%
filter(origin == "EWR") %>%
group_by(dest, dest_name) %>%
summarise(count = n(), .groups = "drop") %>%
arrange(desc(count)) %>%
head(10) %>%
ggplot() +
aes(x = reorder(dest_name, count), y = count) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "destination",
y = "number of flights per year",
title = "Destinations from Newark Int."
) +
theme_bw() +
coord_flip()
flights_dt %>%
drop_na(dest_name) %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(dest, dest_name) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(mean_delay)) %>%
head(10) %>%
ggplot() +
aes(x = reorder(dest_name, mean_delay), y = mean_delay) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "destination",
y = "mean delay (minutes)",
title = "Destinations with longest delays"
) +
theme_bw() +
coord_flip()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_visibility = mean(visib, na.rm = TRUE),
mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = mean_visibility, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean visibility (miles)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by visibility"
) +
theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_wind_speed = mean(wind_speed, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_wind_speed, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean wind speed (mph)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by wind speed"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_wind_dir = mean(wind_dir, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_wind_dir, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean wind direction (degrees)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by wind direction"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_humidity = mean(humid, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_humidity, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean humidity (%)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by humidity"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_temp = mean(temp, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_temp, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean temperature (degf)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by temperature"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_dewpoint = mean(dewp, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_dewpoint, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean dewpoint (degF)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by dewpoint"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_precip = mean(precip, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_precip, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean precipitation (inches)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by precipitation"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_pressure = mean(pressure, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_pressure, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean pressure (mbar)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by pressure"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
# subset the flights dataset so as to drop NAs, filter on only departure delays (no early departures) and only for Newark Int.
flights_weather <- flights_dt %>%
na.omit() %>%
filter(dep_delay > 0 & origin == "EWR") %>%
select(dep_delay, temp, dewp, humid, wind_dir, wind_speed, precip, pressure, visib)
# create correlation matrix
cor_matrix <- cor(flights_weather)
corrplot(cor_matrix, type = "upper", col = cbPalette,
tl.col = "black", method = "number")
ggpairs(flights_weather)
model_1_dt <- flights_dt %>%
na.omit() %>%
filter(dep_delay > 0 & origin == "EWR") %>%
select(dep_delay, temp, dewp, humid, wind_dir, wind_speed, precip, pressure, visib)
model_1 <- lm(dep_delay ~ ., data = model_1_dt)
# view results
summary(model_1)
Call:
lm(formula = dep_delay ~ ., data = model_1_dt)
Residuals:
Min 1Q Median 3Q Max
-59.90 -29.00 -16.77 10.88 815.29
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 631.659739 40.355182 15.653 < 2e-16 ***
temp 0.463713 0.135562 3.421 0.000625 ***
dewp -0.435947 0.145838 -2.989 0.002798 **
humid 0.467200 0.076075 6.141 8.26e-10 ***
wind_dir -0.017681 0.002832 -6.243 4.33e-10 ***
wind_speed 0.486040 0.054098 8.985 < 2e-16 ***
precip -8.181936 17.295980 -0.473 0.636177
pressure -0.623965 0.038058 -16.395 < 2e-16 ***
visib 0.297599 0.213728 1.392 0.163802
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 48.57 on 40553 degrees of freedom
Multiple R-squared: 0.025, Adjusted R-squared: 0.02481
F-statistic: 130 on 8 and 40553 DF, p-value: < 2.2e-16
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
na.omit() %>%
ggplot() +
aes(x = humid, y = dewp) +
geom_point()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
na.omit() %>%
ggplot() +
aes(x = temp, y = dewp) +
geom_point()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
na.omit() %>%
ggplot() +
aes(x = visib, y = humid) +
geom_point()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
na.omit() %>%
ggplot() +
aes(x = dewp, y = pressure) +
geom_point()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
na.omit() %>%
ggplot() +
aes(x = humid, y = wind_speed) +
geom_point()
summary(model_3)
Call:
lm(formula = dep_delay ~ air_time + aircraft_age + alt + hour +
dewp + pressure + dewp:humid + pressure:dewp + dewp:temp +
wind_speed:humid, data = model_2_dt)
Residuals:
Min 1Q Median 3Q Max
-86.30 -27.15 -13.92 9.77 815.55
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.705e+02 7.388e+01 -6.369 1.92e-10 ***
air_time -5.057e-02 2.425e-03 -20.853 < 2e-16 ***
aircraft_age 2.366e-01 4.810e-02 4.918 8.79e-07 ***
alt 1.059e-03 2.286e-04 4.633 3.62e-06 ***
hour 1.737e+00 5.498e-02 31.597 < 2e-16 ***
dewp 2.577e+01 1.827e+00 14.105 < 2e-16 ***
pressure 4.728e-01 7.222e-02 6.547 5.93e-11 ***
dewp:humid 7.640e-03 4.174e-04 18.303 < 2e-16 ***
dewp:pressure -2.607e-02 1.768e-03 -14.745 < 2e-16 ***
dewp:temp 3.842e-03 6.260e-04 6.138 8.43e-10 ***
humid:wind_speed 7.562e-03 8.812e-04 8.581 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 47.4 on 40551 degrees of freedom
Multiple R-squared: 0.07124, Adjusted R-squared: 0.07101
F-statistic: 311.1 on 10 and 40551 DF, p-value: < 2.2e-16
glmulti_fit <- glmulti(
dep_delay ~ .,
data = flights_weather,
level = 2, # 2 = include pairwise interactions, 1 = main effects only (main effect = no pairwise interactions)
minsize = 0, # no min size of model
maxsize = -1, # -1 = no max size of model
marginality = TRUE, # marginality here means the same as 'strongly hierarchical' interactions, i.e. include pairwise interactions only if both predictors present in the model as main effects.
method = "g", # the problem is too large for exhaustive search, so search using a genetic algorithm
crit = bic, # criteria for model selection is BIC value (lower is better)
plotty = FALSE, # don't plot models as function runs
report = TRUE, # do produce reports as function runs
confsetsize = 10, # return best 100 solutions
fitfunction = lm # fit using the `lm` function
)
str(flights_decision_tree)
tibble [91,968 × 19] (S3: tbl_df/tbl/data.frame)
$ distance : num [1:91968] 1400 719 1065 2227 1023 ...
$ engines : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
$ engine : Factor w/ 6 levels "4 Cycle","Reciprocating",..: 3 3 3 3 3 3 3 3 4 3 ...
$ aircraft_age : num [1:91968] 22 9 21 13 15 31 12 20 22 19 ...
$ lat : num [1:91968] 30 42 26.1 36.1 26.7 ...
$ lon : num [1:91968] -95.3 -87.9 -80.2 -115.2 -80.1 ...
$ alt : num [1:91968] 97 668 9 2141 19 ...
$ hour : num [1:91968] 5 5 6 6 6 6 6 6 6 6 ...
$ temp : num [1:91968] 39 39 37.9 37.9 37.9 ...
$ precip : num [1:91968] 0 0 0 0 0 0 0 0 0 0 ...
$ humid : num [1:91968] 64.4 64.4 67.2 67.2 67.2 ...
$ visib : num [1:91968] 10 10 10 10 10 10 10 10 10 10 ...
$ pressure : num [1:91968] 1012 1012 1012 1012 1012 ...
$ dewp : num [1:91968] 28 28 28 28 28 ...
$ wind_dir : num [1:91968] 260 260 240 240 240 240 240 240 240 240 ...
$ wind_speed : num [1:91968] 12.7 12.7 11.5 11.5 11.5 ...
$ month : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
$ late_departure: Factor w/ 2 levels "Late","Ontime/Early": 1 2 2 2 1 2 2 2 2 2 ...
$ season : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
- attr(*, "na.action")= 'omit' Named int [1:25159] 4 9 14 22 26 31 49 58 59 61 ...
..- attr(*, "names")= chr [1:25159] "4" "9" "14" "22" ...
# check split of the data
flights_test %>%
janitor::tabyl(late_departure)
late_departure n percent
Late 8117 0.4413092
Ontime/Early 10276 0.5586908
flights_train %>%
janitor::tabyl(late_departure)
late_departure n percent
Late 32445 0.4409786
Ontime/Early 41130 0.5590214
conf_mat <- flights_test_pred %>%
conf_mat(truth = late_departure, estimate = pred)
conf_mat
Truth
Prediction Late Ontime/Early
Late 4484 3136
Ontime/Early 3633 7140
confusionMatrix(flights_test_pred$pred, flights_test_pred$late_departure)
Confusion Matrix and Statistics
Reference
Prediction Late Ontime/Early
Late 4484 3136
Ontime/Early 3633 7140
Accuracy : 0.632
95% CI : (0.625, 0.639)
No Information Rate : 0.5587
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.2488
Mcnemar's Test P-Value : 1.653e-09
Sensitivity : 0.5524
Specificity : 0.6948
Pos Pred Value : 0.5885
Neg Pred Value : 0.6628
Prevalence : 0.4413
Detection Rate : 0.2438
Detection Prevalence : 0.4143
Balanced Accuracy : 0.6236
'Positive' Class : Late